Guided Project: Predicting bike rentals

Posted on Wed 08 July 2015 in Projects

In [9]:
import pandas

bike_rentals = pandas.read_csv("bike_rental_hour.csv")
bike_rentals.head()
Out[9]:
   instant      dteday  season  yr  mnth  hr  holiday  weekday  workingday  weathersit  temp   atemp   hum  windspeed  casual  registered  cnt
0        1  2011-01-01       1   0     1   0        0        6           0           1  0.24  0.2879  0.81          0       3          13   16
1        2  2011-01-01       1   0     1   1        0        6           0           1  0.22  0.2727  0.80          0       8          32   40
2        3  2011-01-01       1   0     1   2        0        6           0           1  0.22  0.2727  0.80          0       5          27   32
3        4  2011-01-01       1   0     1   3        0        6           0           1  0.24  0.2879  0.75          0       3          10   13
4        5  2011-01-01       1   0     1   4        0        6           0           1  0.24  0.2879  0.75          0       0           1    1
In [10]:
%matplotlib inline

import matplotlib.pyplot as plt

plt.hist(bike_rentals["cnt"])
Out[10]:
(array([ 6972.,  3705.,  2659.,  1660.,   987.,   663.,   369.,   188.,
          139.,    37.]),
 array([   1. ,   98.6,  196.2,  293.8,  391.4,  489. ,  586.6,  684.2,
         781.8,  879.4,  977. ]),
 <a list of 10 Patch objects>)
In [11]:
bike_rentals.corr()["cnt"]
Out[11]:
instant       0.278379
season        0.178056
yr            0.250495
mnth          0.120638
hr            0.394071
holiday      -0.030927
weekday       0.026900
workingday    0.030284
weathersit   -0.142426
temp          0.404772
atemp         0.400929
hum          -0.322911
windspeed     0.093234
casual        0.694564
registered    0.972151
cnt           1.000000
Name: cnt, dtype: float64
In [12]:
def assign_label(hour):
    # Bucket each hour of the day into a time-of-day label:
    # 1 = morning (6-11), 2 = afternoon (12-17),
    # 3 = evening (18-23), 4 = overnight (0-5)
    if 0 <= hour < 6:
        return 4
    elif 6 <= hour < 12:
        return 1
    elif 12 <= hour < 18:
        return 2
    else:
        return 3

bike_rentals["time_label"] = bike_rentals["hr"].apply(assign_label)

Error metric

Mean squared error (MSE) makes the most sense as our error metric: the target, cnt, is continuous numeric data, which is exactly the setting MSE is designed for.
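
Concretely, MSE is just the mean of the squared residuals. A minimal helper (the name mse is mine) mirroring the computation used in the cells below:

import numpy

def mse(predictions, actual):
    # Mean of the squared residuals: large misses are
    # penalized quadratically.
    return numpy.mean((predictions - actual) ** 2)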

In [13]:
# Randomly sample 80% of the rows for the training set.
train = bike_rentals.sample(frac=.8)
In [14]:
# The remaining 20% of rows become the test set.
test = bike_rentals.loc[~bike_rentals.index.isin(train.index)]
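One caveat: sample draws a different 80% each run, so the exact error numbers below will shift from run to run. A sketch of a reproducible split using pandas' random_state parameter (the seed value 1 is arbitrary):

# Fixing the seed makes the split, and hence the errors,
# reproducible across runs.
train = bike_rentals.sample(frac=.8, random_state=1)
test = bike_rentals.loc[~bike_rentals.index.isin(train.index)]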
In [18]:
from sklearn.linear_model import LinearRegression

predictors = list(train.columns)
# Drop the target itself plus the leaky columns: casual and
# registered sum to cnt, and dteday is a raw date string.
predictors.remove("cnt")
predictors.remove("casual")
predictors.remove("registered")
predictors.remove("dteday")

reg = LinearRegression()

reg.fit(train[predictors], train["cnt"])
Out[18]:
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)
In [19]:
import numpy
predictions = reg.predict(test[predictors])

numpy.mean((predictions - test["cnt"]) ** 2)
Out[19]:
16586.154698429491

Linear regression error

The error is very high. This is likely because the data contains a few hours with extremely high rental counts but mostly low ones, and MSE penalizes large errors more heavily, which drives up the total.
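A quick illustration of how the quadratic penalty inflates the total (toy numbers, not from the dataset):

import numpy

# One residual of 100 contributes 10,000 to the squared-error
# sum, as much as one hundred residuals of 10 combined.
print(numpy.mean(numpy.full(100, 10.0) ** 2))  # 100.0
print(numpy.mean(numpy.array([100.0]) ** 2))   # 10000.0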

In [25]:
from sklearn.tree import DecisionTreeRegressor

reg = DecisionTreeRegressor(min_samples_leaf=5)

reg.fit(train[predictors], train["cnt"])
Out[25]:
DecisionTreeRegressor(compute_importances=None, criterion='mse',
           max_depth=None, max_features=None, max_leaf_nodes=None,
           min_density=None, min_samples_leaf=5, min_samples_split=2,
           random_state=None, splitter='best')
In [26]:
predictions = reg.predict(test[predictors])

numpy.mean((predictions - test["cnt"]) ** 2)
Out[26]:
2644.2820429330714
In [28]:
reg = DecisionTreeRegressor(min_samples_leaf=2)

reg.fit(train[predictors], train["cnt"])

predictions = reg.predict(test[predictors])

numpy.mean((predictions - test["cnt"]) ** 2)
Out[28]:
2964.7288070579207

Decision tree error

By capturing the nonlinear relationships between the predictors and cnt, the decision tree regressor achieves a much lower error than linear regression.
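
The two leaf sizes above (5 and 2) were picked by hand; a sketch of a slightly wider sweep over min_samples_leaf, reusing the same train/test split (exact errors will vary with the random split):

from sklearn.tree import DecisionTreeRegressor
import numpy

# Larger leaves regularize the tree; very small leaves overfit.
for leaf in [1, 2, 5, 10, 20]:
    reg = DecisionTreeRegressor(min_samples_leaf=leaf)
    reg.fit(train[predictors], train["cnt"])
    predictions = reg.predict(test[predictors])
    print(leaf, numpy.mean((predictions - test["cnt"]) ** 2))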

In [30]:
from sklearn.ensemble import RandomForestRegressor

reg = RandomForestRegressor(min_samples_leaf=5)
reg.fit(train[predictors], train["cnt"])
Out[30]:
RandomForestRegressor(bootstrap=True, compute_importances=None,
           criterion='mse', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_density=None, min_samples_leaf=5,
           min_samples_split=2, n_estimators=10, n_jobs=1, oob_score=False,
           random_state=None, verbose=0)
In [31]:
predictions = reg.predict(test[predictors])

numpy.mean((predictions - test["cnt"]) ** 2)
Out[31]:
1911.9827104170736

Random forest error

By averaging the predictions of many trees, the random forest removes some of the sources of overfitting that affect a single decision tree, so its error improves again over the decision tree's.
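
The forest above uses scikit-learn's default of 10 trees (visible in the repr). A sketch of trying larger ensembles via n_estimators (results vary with the random split and seeds):

from sklearn.ensemble import RandomForestRegressor
import numpy

# More trees usually reduce variance a bit further, at the
# cost of longer training time.
for n in [10, 50, 100]:
    reg = RandomForestRegressor(n_estimators=n, min_samples_leaf=5)
    reg.fit(train[predictors], train["cnt"])
    predictions = reg.predict(test[predictors])
    print(n, numpy.mean((predictions - test["cnt"]) ** 2))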